import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit
from sklearn import metrics as mt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
import plotly
from plotly.graph_objs import Bar, Line
from plotly.graph_objs import Scatter, Layout
from plotly.graph_objs.scatter import Marker
from plotly.graph_objs.layout import XAxis, YAxis
from matplotlib.pyplot import scatter
from numpy.linalg import pinv
import warnings
# Silence noisy library warnings so notebook output stays readable.
warnings. filterwarnings('ignore', category=UserWarning)
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', FutureWarning)
# Enable plotly charts rendered inline in the notebook.
plotly.offline.init_notebook_mode()
from scipy.special import expit
import copy
from numpy import ma
from scipy.optimize import fmin_bfgs
# Matplotlib styling + inline rendering (IPython magic; notebook-only).
plt.style.use('ggplot')
%matplotlib inline
Let's open and visualize our data:
# Load the obesity dataset ("sinthetic" spelling matches the actual file name)
# and summarize columns/dtypes/null counts.
df=pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2111 entries, 0 to 2110 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 2111 non-null object 1 Age 2111 non-null float64 2 Height 2111 non-null float64 3 Weight 2111 non-null float64 4 family_history_with_overweight 2111 non-null object 5 FAVC 2111 non-null object 6 FCVC 2111 non-null float64 7 NCP 2111 non-null float64 8 CAEC 2111 non-null object 9 SMOKE 2111 non-null object 10 CH2O 2111 non-null float64 11 SCC 2111 non-null object 12 FAF 2111 non-null float64 13 TUE 2111 non-null float64 14 CALC 2111 non-null object 15 MTRANS 2111 non-null object 16 NObeyesdad 2111 non-null object dtypes: float64(8), object(9) memory usage: 280.5+ KB
df
| Gender | Age | Height | Weight | family_history_with_overweight | FAVC | FCVC | NCP | CAEC | SMOKE | CH2O | SCC | FAF | TUE | CALC | MTRANS | NObeyesdad | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 21.000000 | 1.620000 | 64.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 0.000000 | 1.000000 | no | Public_Transportation | Normal_Weight |
| 1 | Female | 21.000000 | 1.520000 | 56.000000 | yes | no | 3.0 | 3.0 | Sometimes | yes | 3.000000 | yes | 3.000000 | 0.000000 | Sometimes | Public_Transportation | Normal_Weight |
| 2 | Male | 23.000000 | 1.800000 | 77.000000 | yes | no | 2.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 1.000000 | Frequently | Public_Transportation | Normal_Weight |
| 3 | Male | 27.000000 | 1.800000 | 87.000000 | no | no | 3.0 | 3.0 | Sometimes | no | 2.000000 | no | 2.000000 | 0.000000 | Frequently | Walking | Overweight_Level_I |
| 4 | Male | 22.000000 | 1.780000 | 89.800000 | no | no | 2.0 | 1.0 | Sometimes | no | 2.000000 | no | 0.000000 | 0.000000 | Sometimes | Public_Transportation | Overweight_Level_II |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2106 | Female | 20.976842 | 1.710730 | 131.408528 | yes | yes | 3.0 | 3.0 | Sometimes | no | 1.728139 | no | 1.676269 | 0.906247 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2107 | Female | 21.982942 | 1.748584 | 133.742943 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.005130 | no | 1.341390 | 0.599270 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2108 | Female | 22.524036 | 1.752206 | 133.689352 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.054193 | no | 1.414209 | 0.646288 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2109 | Female | 24.361936 | 1.739450 | 133.346641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.852339 | no | 1.139107 | 0.586035 | Sometimes | Public_Transportation | Obesity_Type_III |
| 2110 | Female | 23.664709 | 1.738836 | 133.472641 | yes | yes | 3.0 | 3.0 | Sometimes | no | 2.863513 | no | 1.026452 | 0.714137 | Sometimes | Public_Transportation | Obesity_Type_III |
2111 rows × 17 columns
Question: Explain the task and what business-case or use-case it is designed to solve (or designed to investigate). Detail exactly what the classification task is and what parties would be interested in the results. For example, would the model be deployed or used mostly for offline analysis?
Question: Define and prepare your class variables. Use proper variable representations (int, float, one-hot, etc.). Use pre-processing methods (as needed) for dimensionality reduction, scaling, etc. Remove variables that are not needed/useful for the analysis. Describe the final dataset that is used for classification/regression (include a description of any newly formed variables you created).
As explained in the first lab, there are a number of categorical attributes whose values need to be converted to integers (via either binary or one-hot encoding). Moreover, it is necessary to turn some of the floats into integers (features whose entries should be integers but are stored as floats) and to remove the smoking attribute.
With respect to missing or duplicated data, there is no need to impute values, and we will not delete the few duplicated entries. Finally, we will also rename the attributes in order to simplify them.
# Shorten the verbose survey column names to something workable.
short_names = {
    'family_history_with_overweight': 'family',
    'FAVC': 'caloric_food',
    'FCVC': 'vegetables',
    'NCP': 'n_meal',
    'CAEC': 'eat_bet_meal',
    'SMOKE': 'smoking',
    'CH2O': 'water',
    'SCC': 'monitor_cal',
    'FAF': 'physical_active',
    'TUE': 'screen_time',
    'CALC': 'alcohol',
    'MTRANS': 'transport',
    'NObeyesdad': 'obesity',
}
df.rename(columns=short_names, inplace=True)
# These columns are conceptually whole numbers but the synthetic rows carry
# fractional values: round them back to integers.
for col in ['Age', 'water', 'physical_active', 'screen_time']:
    df[col] = df[col].round().astype(int)
# These two are truncated (not rounded), exactly as in the original notebook.
df['vegetables'] = df['vegetables'].astype(int)
df['n_meal'] = df['n_meal'].astype(int)
# Drop the smoking attribute entirely.
del df['smoking']
Before we proceed, let's take a look about how the Obesity Levels are distributed in our population.
# Show how balanced the obesity levels (the target variable) are.
df_grouped = df.groupby(by='obesity')
for level, subset in df_grouped:
    print('There are', len(subset), 'with obesity level:', level)
There are 272 with obesity level: Insufficient_Weight There are 287 with obesity level: Normal_Weight There are 351 with obesity level: Obesity_Type_I There are 297 with obesity level: Obesity_Type_II There are 324 with obesity level: Obesity_Type_III There are 290 with obesity level: Overweight_Level_I There are 290 with obesity level: Overweight_Level_II
# One-hot encode each nominal categorical column, then drop the original.
# A single loop replaces five copy-pasted stanzas; the prefix equals the
# column name, so the resulting dummy columns (and their order) are exactly
# the same as before.
for col in ['family', 'caloric_food', 'monitor_cal', 'transport', 'Gender']:
    tmp_df = pd.get_dummies(df[col], prefix=col)
    df = pd.concat((df, tmp_df), axis=1)
    del df[col]
# Ordinal encodings: map the frequency answers and the obesity levels to ints.
frequency=['no','Sometimes','Frequently','Always']
interval=list(range(4))
# FIX: assign the replaced column back instead of calling
# Series.replace(..., inplace=True) on an attribute-accessed column —
# chained inplace replacement is deprecated in modern pandas and can
# silently operate on a copy.
df['eat_bet_meal'] = df['eat_bet_meal'].replace(frequency, interval)
df['alcohol'] = df['alcohol'].replace(frequency, interval)
# Invert these two scales so that 0 means the highest frequency and
# larger values mean lower frequency.
df['vegetables'] = (df['vegetables'] - 3).abs()
df['physical_active'] = (df['physical_active'] - 3).abs()
obesity_levels=['Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II',
                'Obesity_Type_I', 'Insufficient_Weight', 'Obesity_Type_II',
                'Obesity_Type_III']
# Encode the target: index into obesity_levels (0..6) in the order above.
df['obesity'] = df['obesity'].replace(obesity_levels, list(range(7)))
df
| Age | Height | Weight | vegetables | n_meal | eat_bet_meal | water | physical_active | screen_time | alcohol | ... | caloric_food_yes | monitor_cal_no | monitor_cal_yes | transport_Automobile | transport_Bike | transport_Motorbike | transport_Public_Transportation | transport_Walking | Gender_Female | Gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 21 | 1.620000 | 64.000000 | 1 | 3 | 1 | 2 | 3 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 1 | 21 | 1.520000 | 56.000000 | 0 | 3 | 1 | 3 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2 | 23 | 1.800000 | 77.000000 | 1 | 3 | 1 | 2 | 1 | 1 | 2 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3 | 27 | 1.800000 | 87.000000 | 0 | 3 | 1 | 2 | 1 | 0 | 2 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 4 | 22 | 1.780000 | 89.800000 | 1 | 1 | 1 | 2 | 3 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2106 | 21 | 1.710730 | 131.408528 | 0 | 3 | 1 | 2 | 1 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2107 | 22 | 1.748584 | 133.742943 | 0 | 3 | 1 | 2 | 2 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2108 | 23 | 1.752206 | 133.689352 | 0 | 3 | 1 | 2 | 2 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2109 | 24 | 1.739450 | 133.346641 | 0 | 3 | 1 | 3 | 2 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2110 | 24 | 1.738836 | 133.472641 | 0 | 3 | 1 | 3 | 2 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
2111 rows × 24 columns
We inverted the vegetables consumption and physical activity ranges. Now, the zero value means the highest frequency, whereas the 2 value means the lowest one.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2111 entries, 0 to 2110 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 2111 non-null int64 1 Height 2111 non-null float64 2 Weight 2111 non-null float64 3 vegetables 2111 non-null int64 4 n_meal 2111 non-null int64 5 eat_bet_meal 2111 non-null int64 6 water 2111 non-null int64 7 physical_active 2111 non-null int64 8 screen_time 2111 non-null int64 9 alcohol 2111 non-null int64 10 obesity 2111 non-null int64 11 family_no 2111 non-null uint8 12 family_yes 2111 non-null uint8 13 caloric_food_no 2111 non-null uint8 14 caloric_food_yes 2111 non-null uint8 15 monitor_cal_no 2111 non-null uint8 16 monitor_cal_yes 2111 non-null uint8 17 transport_Automobile 2111 non-null uint8 18 transport_Bike 2111 non-null uint8 19 transport_Motorbike 2111 non-null uint8 20 transport_Public_Transportation 2111 non-null uint8 21 transport_Walking 2111 non-null uint8 22 Gender_Female 2111 non-null uint8 23 Gender_Male 2111 non-null uint8 dtypes: float64(2), int64(9), uint8(13) memory usage: 208.3 KB
Now, we will apply PCA in order to check whether there are any pairs of attributes that co-vary with one another:
sns.set()
# Height vs Weight scatter matrix, colored by Age.
# NOTE(review): vars is a set literal, so the panel order is unspecified —
# pass a list if a fixed order matters.
sns.pairplot(df, vars={"Height","Weight"},hue="Age", height=2)
<seaborn.axisgrid.PairGrid at 0x7f3501917880>
sns.set()
# Same Height/Weight panels, this time colored by Weight itself.
sns.pairplot(df,vars={"Height","Weight"},hue="Weight",height=3)
<seaborn.axisgrid.PairGrid at 0x7f3501af7b50>
Weight and Height do not correlate much with Age. However, there does seem to be a correlation between Height and Weight. We will try applying PCA:
# Build a 2-feature matrix (Weight, Height) and fit a one-component PCA
# to find the direction along which the two co-vary most.
X = np.array([df.Weight, df.Height]).T
pca = PCA(n_components=1)
pca.fit(X)
X_pca = pca.transform(X)
# The component's loadings tell us how much each feature contributes.
print(pca.components_)
[[0.99999864 0.00164991]]
The PCA vector shows that we could easily remove the Height attribute, without the need of the PCA. However, we will use this vector in order to project Weight and Height.
That projection is given by:
df.Weight*pca.components_[0,0]+df.Height*pca.components_[0,1]
0 64.002586
1 56.002432
2 77.002865
3 87.002851
4 89.802815
...
2106 131.411172
2107 133.745646
2108 133.692061
2109 133.349329
2110 133.475328
Length: 2111, dtype: float64
# Overwrite Weight with the 1-D PCA projection and drop Height, which the
# component loadings showed to be almost entirely redundant.
df.Weight=df.Weight*pca.components_[0,0]+df.Height*pca.components_[0,1]
del df['Height']
df
| Age | Weight | vegetables | n_meal | eat_bet_meal | water | physical_active | screen_time | alcohol | obesity | ... | caloric_food_yes | monitor_cal_no | monitor_cal_yes | transport_Automobile | transport_Bike | transport_Motorbike | transport_Public_Transportation | transport_Walking | Gender_Female | Gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 21 | 64.002586 | 1 | 3 | 1 | 2 | 3 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 1 | 21 | 56.002432 | 0 | 3 | 1 | 3 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2 | 23 | 77.002865 | 1 | 3 | 1 | 2 | 1 | 1 | 2 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3 | 27 | 87.002851 | 0 | 3 | 1 | 2 | 1 | 0 | 2 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 4 | 22 | 89.802815 | 1 | 1 | 1 | 2 | 3 | 0 | 1 | 2 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2106 | 21 | 131.411172 | 0 | 3 | 1 | 2 | 1 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2107 | 22 | 133.745646 | 0 | 3 | 1 | 2 | 2 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2108 | 23 | 133.692061 | 0 | 3 | 1 | 2 | 2 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2109 | 24 | 133.349329 | 0 | 3 | 1 | 3 | 2 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2110 | 24 | 133.475328 | 0 | 3 | 1 | 3 | 2 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
2111 rows × 23 columns
Let's take a look into the Explained Variance, in order to check the amount of the original data contained in the PCA vector:
def plot_explained_variance(pca):
    """Render a fitted PCA's per-component and cumulative explained-variance
    ratios as an interactive plotly bar + line chart."""
    ratios = pca.explained_variance_ratio_
    cumulative = np.cumsum(ratios)
    bars = Bar(y=ratios, name='individual explained variance')
    curve = Scatter(y=cumulative, name='cumulative explained variance')
    axes = Layout(xaxis=XAxis(title='Principal components'),
                  yaxis=YAxis(title='Explained variance ratio'))
    plotly.offline.iplot({"data": [bars, curve], "layout": axes})
# BUG FIX: the original wrote `X_pca = pca.fit(X)`. PCA.fit returns the
# estimator itself (not transformed data), so that line silently rebound
# X_pca — previously the projected data — to the estimator object.
# Fitting without the misleading assignment preserves X_pca.
pca.fit(X)
# NOTE(review): this PCA was constructed with n_components=1, so the chart
# shows a single component; refit with more components for a full curve.
plot_explained_variance(pca)
As expected, the dimensionality reduction retained over 99% of the variance of the original data.
df
| Age | Weight | vegetables | n_meal | eat_bet_meal | water | physical_active | screen_time | alcohol | obesity | ... | caloric_food_yes | monitor_cal_no | monitor_cal_yes | transport_Automobile | transport_Bike | transport_Motorbike | transport_Public_Transportation | transport_Walking | Gender_Female | Gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 21 | 64.002586 | 1 | 3 | 1 | 2 | 3 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 1 | 21 | 56.002432 | 0 | 3 | 1 | 3 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2 | 23 | 77.002865 | 1 | 3 | 1 | 2 | 1 | 1 | 2 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3 | 27 | 87.002851 | 0 | 3 | 1 | 2 | 1 | 0 | 2 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
| 4 | 22 | 89.802815 | 1 | 1 | 1 | 2 | 3 | 0 | 1 | 2 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2106 | 21 | 131.411172 | 0 | 3 | 1 | 2 | 1 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2107 | 22 | 133.745646 | 0 | 3 | 1 | 2 | 2 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2108 | 23 | 133.692061 | 0 | 3 | 1 | 2 | 2 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2109 | 24 | 133.349329 | 0 | 3 | 1 | 3 | 2 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2110 | 24 | 133.475328 | 0 | 3 | 1 | 3 | 2 | 1 | 1 | 6 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
2111 rows × 23 columns
Question: Divide your data into training and testing data using an 80% training and 20% testing split. Use the cross validation modules that are part of scikit-learn. Argue "for" or "against" splitting your data using an 80/20 split. That is, why is the 80/20 split appropriate (or not) for your dataset?
Age and Weight attributes have a wide range. Therefore, we will normalize them before we proceed.
# Separate the (already integer-encoded) target from the feature matrix.
y=df.obesity
del df['obesity']
# Standardize the two wide-range continuous features (z-score).
norm_features=['Age','Weight']
df[norm_features]=(df[norm_features]-df[norm_features].mean())/df[norm_features].std()
# Rebuild X from the final, fully numeric DataFrame.
del X
X=df.to_numpy()
num_cv_iterations = 3
num_instances = len(y)  # NOTE(review): defined but never used below — confirm before removing
# Three repeated 80/20 shuffled train/test splits.
# NOTE(review): no random_state is set, so splits differ from run to run.
cv_object = ShuffleSplit(
    n_splits=num_cv_iterations,
    test_size = 0.2)
print(cv_object)
ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)
Now, we will validate it using sklearn.
# Reference model: sklearn's multinomial logistic regression with class
# balancing, evaluated over the repeated shuffled train/test splits.
valid_split=SKLogisticRegression(class_weight='balanced',multi_class='multinomial')
# the indices are the rows used for training and testing in each iteration;
# enumerate replaces the original manual iter_num counter.
for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    # new variables here so that it is more obvious what the code is doing
    # (you can compact this syntax and avoid duplicating memory,
    # but it makes this code less readable)
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    # train the reusable logistic regression model on the training data
    valid_split.fit(X_train,y_train)
    y_hat = valid_split.predict(X_test) # get test set predictions
    # accuracy and confusion matrix for this iteration of training/testing
    acc = mt.accuracy_score(y_test,y_hat)
    conf = mt.confusion_matrix(y_test,y_hat)
    print("====Iteration",iter_num," ====")
    print("accuracy", acc )
    print("confusion matrix\n",conf)
====Iteration 0 ==== accuracy 0.789598108747045 confusion matrix [[29 5 3 0 20 0 0] [ 4 52 13 0 0 0 0] [ 2 7 33 13 0 1 0] [ 0 6 6 58 0 3 1] [ 3 0 0 0 56 0 0] [ 0 0 1 0 0 49 1] [ 0 0 0 0 0 0 57]] ====Iteration 1 ==== accuracy 0.8108747044917257 confusion matrix [[29 4 3 0 18 0 0] [ 1 38 16 4 0 0 0] [ 0 6 49 8 0 2 0] [ 0 1 8 54 0 4 1] [ 3 0 0 0 43 0 0] [ 0 0 0 1 0 74 0] [ 0 0 0 0 0 0 56]] ====Iteration 2 ==== accuracy 0.7801418439716312 confusion matrix [[35 11 2 0 12 0 0] [ 2 44 8 4 0 0 0] [ 2 9 29 14 0 2 0] [ 0 2 15 48 0 3 1] [ 4 0 0 0 55 0 0] [ 0 0 0 0 0 44 2] [ 0 0 0 0 0 0 75]]
Apparently, our model correctly classified about 80% of the test data (using the scikit-learn implementation). The result seems suitable, but before drawing any conclusions, let's implement the model with numpy and check its accuracy.
Question: Create a custom, one-versus-all logistic regression classifier using numpy and scipy to optimize. Use object oriented conventions identical to scikit-learn. You should start with the template developed by the instructor in the course. You should add the following functionality to the logistic regression classifier:
class BinaryLogisticRegression:
    """Binary logistic regression trained by batch gradient ascent with an
    L2 penalty, following sklearn conventions (weights stored in ``w_``)."""

    def __init__(self, eta, iterations=20, C=0.001):
        self.eta = eta          # learning rate for the ascent step
        self.iters = iterations
        self.C = C              # L2 regularization strength
        # weights live in self.w_ after fit(), per sklearn convention

    def __str__(self):
        if hasattr(self, 'w_'):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_)
        return 'Untrained Binary Logistic Regression Object'

    # ---- private helpers -------------------------------------------------
    @staticmethod
    def _add_bias(X):
        # prepend a column of ones so w_[0] acts as the intercept
        ones = np.ones((X.shape[0], 1))
        return np.hstack((ones, X))

    @staticmethod
    def _sigmoid(theta):
        # expit is a numerically stable 1/(1+exp(-theta))
        return expit(theta)

    def _get_gradient(self, X, y):
        # Gradient of the regularized log-likelihood; X already carries bias.
        residual = y - self.predict_proba(X, add_bias=False).ravel()
        grad = np.mean(X * residual[:, np.newaxis], axis=0)
        grad = grad.reshape(self.w_.shape)
        grad[1:] += -2 * self.w_[1:] * self.C  # L2 penalty, bias excluded
        return grad

    # ---- public API ------------------------------------------------------
    def predict_proba(self, X, add_bias=True):
        """Return P(y=1) for each row of X (bias column added unless told not to)."""
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_)

    def predict(self, X):
        """Return boolean predictions: True where P(y=1) exceeds 0.5."""
        return self.predict_proba(X) > 0.5

    def fit(self, X, y):
        """Run ``iters`` gradient-ascent steps from zero-initialized weights."""
        Xb = self._add_bias(X)
        num_samples, num_features = Xb.shape
        self.w_ = np.zeros((num_features, 1))
        for _ in range(self.iters):
            # add the gradient because we are maximizing the likelihood
            self.w_ += self._get_gradient(Xb, y) * self.eta
class MultiClassLogisticRegression:
    """One-vs-all multiclass wrapper: trains one binary classifier per class
    and predicts via argmax over the per-class probabilities."""

    def __init__(self, eta, iterations=20,
                 C=0.0001,
                 solver=BinaryLogisticRegression):
        self.eta = eta
        self.iters = iterations
        self.C = C
        self.solver = solver        # binary classifier class to instantiate
        self.classifiers_ = []
        # weights are collected into self.w_ (one row per class) after fit()

    def __str__(self):
        if hasattr(self, 'w_'):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_)
        return 'Untrained MultiClass Logistic Regression Object'

    def fit(self, X, y):
        """Fit one binary one-vs-all classifier per unique class value in y."""
        self.unique_ = np.sort(np.unique(y))
        self.classifiers_ = []
        for target in self.unique_:
            # binarize: 1 for the current class, 0 for every other class
            y_ova = np.array(y == target).astype(int)
            clf = self.solver(eta=self.eta, iterations=self.iters, C=self.C)
            clf.fit(X, y_ova)
            self.classifiers_.append(clf)
        # one weight column per class, stacked side by side then transposed
        self.w_ = np.hstack([c.w_ for c in self.classifiers_]).T

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) matrix of per-class probabilities."""
        cols = [c.predict_proba(X).reshape((len(X), 1)) for c in self.classifiers_]
        return np.hstack(cols)

    def predict(self, X):
        """Return, per row, the index (into the sorted unique classes) of the
        most probable class."""
        return np.argmax(self.predict_proba(X), axis=1)
Let's add the classes for each optimization technique:
class HessianBinaryLogisticRegression(BinaryLogisticRegression):
    """Newton's-method variant: the step direction is pinv(H) @ gradient,
    with H the (negated) Hessian of the regularized log-likelihood."""
    # just overwrite gradient function
    def _get_gradient(self, X, y):
        g = self.predict_proba(X, add_bias=False).ravel()  # sigmoid for every sample
        # FIX 1: the L2 penalty -C*||w||^2 contributes 2*C on the DIAGONAL of
        #        the (negated) Hessian; the original subtracted the scalar 2*C
        #        from every entry of the matrix.
        # FIX 2: X.T @ np.diag(g*(1-g)) @ X materialized an n_samples^2 diagonal
        #        matrix; row-scaling X directly is equivalent and O(n*d).
        hessian = X.T @ (X * (g * (1 - g))[:, np.newaxis]) + 2 * self.C * np.eye(X.shape[1])
        ydiff = y - g                                      # get y difference
        gradient = np.sum(X * ydiff[:, np.newaxis], axis=0)
        gradient = gradient.reshape(self.w_.shape)         # column-vector form
        gradient[1:] += -2 * self.w_[1:] * self.C          # L2 penalty, bias excluded
        return pinv(hessian) @ gradient
# One-vs-all wrapper around the Newton-step binary solver.
newton=MultiClassLogisticRegression(eta=0.05,solver=HessianBinaryLogisticRegression,iterations=20)
print(newton)
Untrained MultiClass Logistic Regression Object
newton.fit(X_train,y_train)
# BUG FIX: the original discarded newton.predict(X_test) and then scored the
# stale y_hat left over from the sklearn CV loop, so the printed number was
# sklearn's accuracy, not this model's. Score newton's own predictions.
y_hat = newton.predict(X_test)
print(accuracy_score(y_test,y_hat))
0.7801418439716312
class BFGSBinaryLogisticRegression(BinaryLogisticRegression):
    """Binary logistic regression optimized with scipy's BFGS quasi-Newton solver."""

    @staticmethod
    def objective_function(w, X, y, C):
        """Negated, L2-regularized log-likelihood (negated because scipy
        minimizes, but the formulas were derived for maximizing)."""
        g = expit(X @ w)
        # masked log (ma.log) avoids -inf when a probability saturates to 0/1
        # FIX: penalize w[1:] only, matching the gradient below — the original
        #      regularized the bias in the objective but not in the gradient.
        return -np.sum(ma.log(g[y == 1])) - np.sum(ma.log(1 - g[y == 0])) + C * np.sum(w[1:] ** 2)

    @staticmethod
    def objective_gradient(w, X, y, C):
        """Gradient of objective_function with respect to w."""
        g = expit(X @ w)
        ydiff = y - g  # get y difference
        # FIX: the objective above is a SUM over samples, but the original
        #      gradient used np.mean — f and fprime handed to fmin_bfgs must
        #      be on the same scale or BFGS line searches misbehave.
        gradient = np.sum(X * ydiff[:, np.newaxis], axis=0)
        gradient = gradient.reshape(w.shape)
        gradient[1:] += -2 * w[1:] * C
        return -gradient

    # just overwrite fit: hand objective + gradient to scipy's BFGS
    def fit(self, X, y):
        Xb = self._add_bias(X)  # add bias term
        num_samples, num_features = Xb.shape
        self.w_ = fmin_bfgs(self.objective_function,       # what to optimize
                            np.zeros((num_features, 1)),   # starting point
                            fprime=self.objective_gradient,
                            args=(Xb, y, self.C),          # extra args for both callables
                            gtol=1e-03,                    # stopping criterion on |gradient|
                            maxiter=self.iters,            # iteration cap
                            disp=False)
        self.w_ = self.w_.reshape((num_features, 1))
# One-vs-all wrapper around the BFGS quasi-Newton binary solver.
quasi_newton=MultiClassLogisticRegression(eta=0.05,solver=BFGSBinaryLogisticRegression,iterations=20)
print(quasi_newton)
Untrained MultiClass Logistic Regression Object
quasi_newton.fit(X_train,y_train)
# BUG FIX: as with the Newton cell, the original discarded these predictions
# and re-scored a stale y_hat, so the printed accuracy did not belong to this
# model. Score quasi_newton's own predictions.
y_hat = quasi_newton.predict(X_test)
print(accuracy_score(y_test,y_hat))
0.7801418439716312
Question: Train your classifier to achieve good generalization performance. That is, adjust the optimization technique and the value of the regularization term(s) "C" to achieve the best performance on your test set. Visualize the performance of the classifier versus the parameters you investigated. Is your method of selecting parameters justified? That is, do you think there is any "data snooping" involved with this method of selecting parameters?
Question: Compare the performance of your "best" logistic regression optimization procedure to the procedure used in scikit-learn. Visualize the performance differences in terms of training time and classification performance. Discuss the results.
Question: Which implementation of logistic regression would you advise be used in a deployed machine learning model, your implementation or scikit-learn (or other third party)? Why?
Question: Choose ONE of the following: